How to Find My Bar in New York?

spatial
analysis
Author

Tianxiao Chen

Published

December 20, 2023

Introduction

After a busy week at work why not head to New York for a great weekend! But because there are so many options in New York, choosing where to stay and where to go for the night can sometimes be a difficult decision. Therefore, this analysis aims to analyze and visualize the data of Airbnb and bars in New York to provide a guide for future visitors to New York.

The project uses the following datasets: 1. New York City Airbnb Open Data. This dataset includes the features, prices, and locations of the rooms, and it is the main dataset for the final Airbnb selection. 2. 2016 Parties in New York. This dataset includes the locations of bars and the number of noise complaints recorded for each of them. It is used to identify the number of entertainment venues in the vicinity of each site and the likely noise levels. 3. Uber Pickups in New York City. This dataset includes the location and time of Uber pickups, which indicates how easy it is to get around. 4. Census Data. This provides the basic regional unit (the neighborhood) used in the later steps to suggest where visitors should choose their preferred Airbnb.

Code
%env MYPATH=C:/Folder Name/file.txt

import pandas as pd
import os
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

import folium
import xyzservices
import panel as pn

import datetime
import time

import seaborn as sns
from matplotlib import pyplot as plt
import holoviews as hv
import hvplot.pandas
import contextily as ctx
import geoviews as gv
import geoviews.tile_sources as gvts

import altair as alt
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.preprocessing import StandardScaler

import requests
from sodapy import Socrata
import missingno as msno
from scipy.stats import gaussian_kde

import osmnx as ox
import folium
import altair as alt
from wordcloud import WordCloud
Code
# Load every dataset from Google Drive.
# A Drive share link has the form .../file/d/<file id>/view?usp=sharing, so the
# file id is the second-to-last "/"-separated piece of the link.
url_basic = 'https://drive.google.com/uc?id='


def _drive_url(share_link):
    """Return url_basic followed by the file id taken from a Drive share link."""
    return url_basic + share_link.split('/')[-2]


# Share links, one per dataset.
air = 'https://drive.google.com/file/d/1b0Hih_3K-xS3DZ6YKPSM_97yrGPEYlHE/view?usp=sharing'
uber = 'https://drive.google.com/file/d/1p_BO6Kd_R-dRGLliP8cDBdC01pF2OqmJ/view?usp=sharing'
bar = 'https://drive.google.com/file/d/1gLf9IyT6Vy2_ZiWfaQvOXRMMwOWPog3z/view?usp=sharing'
parties = 'https://drive.google.com/file/d/1U0eZyg1UhuCSpDobZZgClj1v1STCfNvM/view?usp=sharing'
parties_test = 'https://drive.google.com/file/d/1LVlJ64Wvj-p43AGpkH09BzUtJJeIAVwt/view?usp=sharing'
parties_train = 'https://drive.google.com/file/d/1ww_K4UF-xSagwnqz7nNoKH_Ojv0QL9Iy/view?usp=sharing'
boundary = 'https://drive.google.com/file/d/1nZz5GG3pPcNhNcvQYA_DTAaKAI5w6J-f/view?usp=sharing'
nbhd = 'https://drive.google.com/file/d/1hI840aCWK2vbam-6SVT-NqTBM0BkLaY5/view?usp=sharing'

# Download URLs derived from the share links.
url_air = _drive_url(air)
url_uber = _drive_url(uber)
url_bar = _drive_url(bar)
url_prt = _drive_url(parties)
url_prt_test = _drive_url(parties_test)
url_prt_train = _drive_url(parties_train)
url_bund = _drive_url(boundary)
url_nbhd = _drive_url(nbhd)

# Tabular data; timestamp columns are parsed while reading.
air_df = pd.read_csv(url_air)
uber_df = pd.read_csv(url_uber, parse_dates=['Date/Time'])
bar_df = pd.read_csv(url_bar)
prt_df = pd.read_csv(url_prt, parse_dates=['Created Date', 'Closed Date'])
prt_test_df = pd.read_csv(url_prt_test)
prt_train_df = pd.read_csv(url_prt_train)

# Polygon layers: bdry_gdf is later joined on 'boro_name', nbhd_gdf on 'ntacode'.
bdry_gdf = gpd.read_file(url_bund, crs='EPSG:4326')
nbhd_gdf = gpd.read_file(url_nbhd, crs='EPSG:4326')
Code
# airbnb dataset clean
# Identifier/host columns and the last-review date are dropped.
dropy = ['id', 'host_id', 'host_name', 'last_review']
air_df = air_df.drop(columns=dropy)
# A missing reviews_per_month becomes 0 so those rows survive the dropna() below.
air_df = air_df.fillna({'reviews_per_month': 0})
air_df = air_df.dropna()
# Keep listings priced at 4000 or less.
air_df = air_df.loc[air_df['price'].le(4000)]
# One point per listing built from longitude/latitude, tagged as EPSG:4326.
air_gdf = gpd.GeoDataFrame(
    air_df,
    geometry=gpd.points_from_xy(air_df['longitude'], air_df['latitude']),
    crs='EPSG:4326',
)
air_gdf
name neighbourhood_group neighbourhood latitude longitude room_type price minimum_nights number_of_reviews reviews_per_month calculated_host_listings_count availability_365 geometry
0 Clean & quiet apt home by the park Brooklyn Kensington 40.64749 -73.97237 Private room 149 1 9 0.21 6 365 POINT (-73.97237 40.64749)
1 Skylit Midtown Castle Manhattan Midtown 40.75362 -73.98377 Entire home/apt 225 1 45 0.38 2 355 POINT (-73.98377 40.75362)
2 THE VILLAGE OF HARLEM....NEW YORK ! Manhattan Harlem 40.80902 -73.94190 Private room 150 3 0 0.00 1 365 POINT (-73.94190 40.80902)
3 Cozy Entire Floor of Brownstone Brooklyn Clinton Hill 40.68514 -73.95976 Entire home/apt 89 1 270 4.64 1 194 POINT (-73.95976 40.68514)
4 Entire Apt: Spacious Studio/Loft by central park Manhattan East Harlem 40.79851 -73.94399 Entire home/apt 80 10 9 0.10 1 0 POINT (-73.94399 40.79851)
... ... ... ... ... ... ... ... ... ... ... ... ... ...
48890 Charming one bedroom - newly renovated rowhouse Brooklyn Bedford-Stuyvesant 40.67853 -73.94995 Private room 70 2 0 0.00 2 9 POINT (-73.94995 40.67853)
48891 Affordable room in Bushwick/East Williamsburg Brooklyn Bushwick 40.70184 -73.93317 Private room 40 4 0 0.00 2 36 POINT (-73.93317 40.70184)
48892 Sunny Studio at Historical Neighborhood Manhattan Harlem 40.81475 -73.94867 Entire home/apt 115 10 0 0.00 1 27 POINT (-73.94867 40.81475)
48893 43rd St. Time Square-cozy single bed Manhattan Hell's Kitchen 40.75751 -73.99112 Shared room 55 1 0 0.00 6 2 POINT (-73.99112 40.75751)
48894 Trendy duplex in the very heart of Hell's Kitchen Manhattan Hell's Kitchen 40.76404 -73.98933 Private room 90 7 0 0.00 1 23 POINT (-73.98933 40.76404)

48847 rows × 13 columns

Code
# party dataset clean
# Minutes between a complaint being created and closed. Timedelta arithmetic is
# used instead of the deprecated Series.view('int64') cast. A row with a missing
# timestamp gets NaN here and is removed by the duration filter below.
prt_df['duration'] = (prt_df['Closed Date'] - prt_df['Created Date']).dt.total_seconds() / 60
# 'hour' is taken from the closing time; weekday and calendar date from the creation time.
prt_df['hour'] = prt_df['Closed Date'].dt.hour
prt_df['dow'] = prt_df['Created Date'].dt.weekday
prt_df['date'] = prt_df['Created Date'].dt.date
# One point per complaint built from Longitude/Latitude, tagged as EPSG:4326.
prt_gdf = gpd.GeoDataFrame(prt_df, geometry=[Point(xy) for xy in zip(prt_df.Longitude, prt_df.Latitude)])
prt_gdf.set_crs(epsg=4326, inplace=True)
# Keep complaints with a positive duration shorter than 480 minutes (8 hours).
prt_gdf = prt_gdf[(prt_gdf['duration'] > 0) & (prt_gdf['duration'] < 480)]
prt_gdf.head(5)
Created Date Closed Date Location Type Incident Zip City Borough Latitude Longitude duration hour dow date geometry
0 2015-12-31 00:01:15 2015-12-31 03:48:04 Store/Commercial 10034.0 NEW YORK MANHATTAN 40.866183 -73.918930 226.816667 3.0 3 2015-12-31 POINT (-73.91893 40.86618)
1 2015-12-31 00:02:48 2015-12-31 04:36:13 Store/Commercial 10040.0 NEW YORK MANHATTAN 40.859324 -73.931237 273.416667 4.0 3 2015-12-31 POINT (-73.93124 40.85932)
2 2015-12-31 00:03:25 2015-12-31 00:40:15 Residential Building/House 10026.0 NEW YORK MANHATTAN 40.799415 -73.953371 36.833333 0.0 3 2015-12-31 POINT (-73.95337 40.79942)
3 2015-12-31 00:03:26 2015-12-31 01:53:38 Residential Building/House 11231.0 BROOKLYN BROOKLYN 40.678285 -73.994668 110.200000 1.0 3 2015-12-31 POINT (-73.99467 40.67829)
4 2015-12-31 00:05:10 2015-12-31 03:49:10 Residential Building/House 10033.0 NEW YORK MANHATTAN 40.850304 -73.938516 224.000000 3.0 3 2015-12-31 POINT (-73.93852 40.85030)
Code
# Derive hour-of-day, weekday and calendar date from the pickup timestamp.
pickup_time = uber_df['Date/Time'].dt
uber_df['hour'] = pickup_time.hour
uber_df['dow'] = pickup_time.weekday
uber_df['date'] = pickup_time.date
# One point per pickup built from Lon/Lat, tagged as EPSG:4326.
uber_gdf = gpd.GeoDataFrame(
    uber_df,
    geometry=gpd.points_from_xy(uber_df['Lon'], uber_df['Lat']),
    crs='EPSG:4326',
)
uber_gdf.head(5)
Date/Time Lat Lon Base hour dow date geometry
0 2014-06-01 00:00:00 40.7293 -73.9920 B02512 0 6 2014-06-01 POINT (-73.99200 40.72930)
1 2014-06-01 00:01:00 40.7131 -74.0097 B02512 0 6 2014-06-01 POINT (-74.00970 40.71310)
2 2014-06-01 00:04:00 40.3461 -74.6610 B02512 0 6 2014-06-01 POINT (-74.66100 40.34610)
3 2014-06-01 00:04:00 40.7555 -73.9833 B02512 0 6 2014-06-01 POINT (-73.98330 40.75550)
4 2014-06-01 00:07:00 40.6880 -74.1831 B02512 0 6 2014-06-01 POINT (-74.18310 40.68800)
Code
# One point per bar built from Longitude/Latitude, tagged as EPSG:4326.
bar_points = gpd.points_from_xy(bar_df['Longitude'], bar_df['Latitude'])
bar_gdf = gpd.GeoDataFrame(bar_df, geometry=bar_points, crs='EPSG:4326')
# Keep only venues with fewer than 250 recorded calls.
bar_gdf = bar_gdf.loc[bar_gdf['num_calls'].lt(250)]
bar_gdf.head(5)
Location Type Incident Zip City Borough Latitude Longitude num_calls geometry
0 Club/Bar/Restaurant 10308.0 STATEN ISLAND STATEN ISLAND 40.544096 -74.141155 40 POINT (-74.14115 40.54410)
1 Club/Bar/Restaurant 10012.0 NEW YORK MANHATTAN 40.729793 -73.998842 18 POINT (-73.99884 40.72979)
2 Club/Bar/Restaurant 10308.0 STATEN ISLAND STATEN ISLAND 40.544209 -74.141040 21 POINT (-74.14104 40.54421)
3 Club/Bar/Restaurant 10034.0 NEW YORK MANHATTAN 40.866376 -73.928258 160 POINT (-73.92826 40.86638)
4 Club/Bar/Restaurant 11220.0 BROOKLYN BROOKLYN 40.635207 -74.020285 17 POINT (-74.02028 40.63521)

Exploratory Analysis

Where should I live? – Airbnb Analysis & Visualization

Code
# group by borough
# Mean listing price per borough, joined onto the borough boundary polygons.
nbhd_price = air_df.groupby('neighbourhood_group', as_index=False)['price'].mean()
bdry_price = bdry_gdf.merge(
    nbhd_price,
    how='inner',
    left_on='boro_name',
    right_on='neighbourhood_group',
)
Code
# group by neighborhood
# Spatially join each listing to the neighborhood polygons.
air_cen_gdf = air_gdf.sjoin(nbhd_gdf, how='inner')

# Per-neighborhood ('ntacode') mean of each listing metric.
nta_means = {
    metric: air_cen_gdf.groupby('ntacode')[metric].mean().reset_index()
    for metric in ('price', 'number_of_reviews', 'reviews_per_month')
}
census_price = nta_means['price']
census_review = nta_means['number_of_reviews']
census_rm = nta_means['reviews_per_month']

# Combine the three means into one table, then attach it to the polygons.
census_pr = census_price.merge(census_review, on='ntacode')
census_prm = census_pr.merge(census_rm, on='ntacode')
metric_cols = ['price', 'number_of_reviews', 'reviews_per_month', 'ntacode']
census_prm = nbhd_gdf.merge(census_prm[metric_cols], on='ntacode')

To better understand the Airbnb situation in New York, I first visualized the statistical distributions of the Airbnb data. We found that, with the exception of some higher-priced listings, the distribution of prices is close to a normal distribution for most homes, with a concentration in the $20-$500 per night range. As for the number of reviews, the vast majority of listings received only a few reviews. When it comes to room type, ‘Entire home/apt’ and ‘Private room’ make up the majority. What’s more, most listings are located in Manhattan and Brooklyn, which can be explained by the fact that Brooklyn and Manhattan have most of New York’s places to hang out.

Code
# airbnb situation in New York
# 2x2 grid: price and review-count histograms on top,
# listing counts by room type and by borough below.
_, axss = plt.subplots(nrows=2, ncols=2, figsize=[20, 8])
for column, n_bins, axis in (
    ('price', 80, axss[0, 0]),
    ('number_of_reviews', 100, axss[0, 1]),
):
    sns.histplot(air_df[column], bins=n_bins, kde=False, color='#2a9d8f', ax=axis)
for column, axis in (
    ('room_type', axss[1, 0]),
    ('neighbourhood_group', axss[1, 1]),
):
    sns.countplot(x=column, data=air_df, ax=axis)
plt.show()

To better understand Airbnb’s geographic distribution patterns in New York, we used map visualizations for further analysis. The spatial distribution of locations shows that the density of listings gradually decreases in all directions, centered on Manhattan, while higher prices accumulate at the center of that density. Looking at average prices by borough, Manhattan and Brooklyn have the highest and second-highest averages.

Code
# Static point map of the listings, colored by price class.
fig, ax = plt.subplots(facecolor="#e5e7eb", figsize=(8, 5))

# Plot
price_points_style = {
    "column": "price",
    "scheme": "Quantiles",  # classify prices into k quantile classes
    "k": 5,
    "cmap": "Reds",
    "markersize": 2,
    "edgecolor": "black",
    "linewidth": 0.1,
    "legend": True,
    "legend_kwds": {"loc": "lower right", "fontsize": 10},
}
air_gdf.plot(ax=ax, **price_points_style)

ax.set_title("Airbnb Price Location in NY")
ax.set_axis_off()
ax.set_aspect("equal")

Code
# Interactive map of the borough polygons colored by mean price (Fisher-Jenks classes).
m = bdry_price.explore(
    scheme="FisherJenks",
    column="price",
)
m
Make this Notebook Trusted to load map: File -> Trust Notebook

And when we look at the distribution of Airbnb prices and reviews at the smaller neighborhood scale, we can see that Airbnb prices also gradually decrease in all directions, centered on Upper Manhattan. As for the reviews of the listings, we found that the number of reviews in the higher-priced areas is relatively low. The lower-priced areas have a higher number of reviews as well as a higher average number of reviews per month. This can be explained by the fact that higher-priced homes have a relatively smaller audience, as fewer people can afford them.

Code
# Interactive map of the neighborhoods colored by mean number of reviews (Fisher-Jenks classes).
m = census_prm.explore(
    scheme="FisherJenks",
    column="number_of_reviews",
)
m
Make this Notebook Trusted to load map: File -> Trust Notebook
Code
# Interactive map of the neighborhoods colored by mean listing price (Fisher-Jenks classes).
m = census_prm.explore(
    scheme="FisherJenks",
    column="price",
)
m
Make this Notebook Trusted to load map: File -> Trust Notebook

Overall, a prime (higher-priced) location and plentiful review information are rarely available at the same time. Those who are not price-sensitive have more options in Manhattan as well as Northwest Brooklyn. For a more cost-effective and better-reviewed option, consider listings in the Bronx and Queens!

Where Can I have fun? – Entertainment Analysis & Visualization

In addition to accommodation options, we also paid equal attention to what ‘post-fun’ places there are to choose from in New York outside of everyday play. So why not go to a bar? Looking at the distribution of bars in New York, most of them are concentrated in Manhattan, which has the most prosperous business activity and nightlife in New York. However, when we look at complaints, we find that the average number of complaints received by bars in boroughs other than the Bronx is about the same, with Queens having the highest average, which is probably due to the fact that Queens itself is a large borough. And as noisy areas are accompanied by disturbances at night, areas near bars with many complaints should be avoided as much as possible when choosing an Airbnb.

Code
# Per-borough bar statistics: mean number of calls and number of bars.
# The borough key is lower-cased on every table before the tables are joined.
bar_call = bar_gdf.groupby('Borough')['num_calls'].mean().reset_index()
bar_call['Borough'] = bar_call['Borough'].str.lower()
# Number of bars per borough, counted as the non-missing 'City' entries.
bar_count = bar_gdf.groupby('Borough')['City'].count().reset_index(name='Count')
bar_count['Borough'] = bar_count['Borough'].str.lower()
bar_cc = bar_call.merge(bar_count, on='Borough')
# Lower-case the boundary names on a copy. Overwriting bdry_gdf['boro_name'] in
# place would leave the shared boundary table lower-cased, so re-running the
# price-by-borough merge against 'neighbourhood_group' would no longer match.
boro_lower = bdry_gdf.assign(boro_name=bdry_gdf['boro_name'].str.lower())
bar_bd = boro_lower.merge(bar_cc, left_on='boro_name', right_on='Borough', how='inner')
Code
# Static point map of the bars, colored by number-of-calls class.
fig, ax = plt.subplots(facecolor="#e5e7eb", figsize=(8, 5))

# Plot
call_points_style = {
    "column": "num_calls",
    "scheme": "Quantiles",  # classify call counts into k quantile classes
    "k": 5,
    "cmap": "Reds",
    "markersize": 2,
    "edgecolor": "black",
    "linewidth": 0.1,
    "legend": True,
    "legend_kwds": {"loc": "lower right", "fontsize": 10},
}
bar_gdf.plot(ax=ax, **call_points_style)

ax.set_title("Bar Complaints in NY")
ax.set_axis_off()
ax.set_aspect("equal")

Code
# Two stacked bar charts per borough: bar count on top, mean calls below.
_, axss = plt.subplots(2, figsize=[20, 10])
count_ax, calls_ax = axss
sns.barplot(data=bar_bd, x='boro_name', y='num_calls', ax=calls_ax)
sns.barplot(data=bar_bd, x='boro_name', y='Count', ax=count_ax)
<Axes: xlabel='boro_name', ylabel='Count'>

Code
# Spatially join each bar to the neighborhood polygons.
bar_cen_gdf = bar_gdf.sjoin(nbhd_gdf, how='inner')
bars_by_nta = bar_cen_gdf.groupby('ntacode')
# Mean number of calls per neighborhood.
bar_call = bars_by_nta['num_calls'].mean().reset_index()
# Bars per neighborhood: the non-null count of 'City' serves as the bar count.
bar_call_sum = bars_by_nta.count().reset_index()[['ntacode', 'City']]
# Attach both statistics to the neighborhood polygons.
cen_bar = nbhd_gdf.merge(bar_call, on='ntacode').merge(bar_call_sum, on='ntacode')
Code
# Per-neighborhood mean calls next to the bar count (held in the 'City' column).
bar_cc = bar_call.merge(right=bar_call_sum, on='ntacode')
bar_cc.head(4)
ntacode num_calls City
0 BK09 12.000000 4
1 BK17 37.714286 7
2 BK19 58.800000 5
3 BK21 14.000000 1
Code
# Interactive map of the neighborhoods colored by mean calls per bar (Fisher-Jenks classes).
m = cen_bar.explore(
    scheme="FisherJenks",
    column="num_calls",
)
m
Make this Notebook Trusted to load map: File -> Trust Notebook
Code
# Interactive map of the neighborhoods colored by bar count, held in 'City' (Fisher-Jenks classes).
m = cen_bar.explore(
    scheme="FisherJenks",
    column="City",
)
m
Make this Notebook Trusted to load map: File -> Trust Notebook

And when we looked at where and when parties were held in New York, we were able to see that the most parties were held in residential buildings, and the vast majority of parties ended between nighttime hours and 5 a.m. the next day, which means that most of the venues where parties took place probably weren’t a good choice for an Airbnb location!

Code
# 2x2 grid of party-complaint distributions: weekday and hour on top,
# duration histogram and location type below.
_, axss = plt.subplots(2, 2, figsize=[20, 10])
(dow_ax, hour_ax), (duration_ax, place_ax) = axss
sns.countplot(data=prt_gdf, x='dow', ax=dow_ax)
sns.countplot(data=prt_gdf, x='hour', ax=hour_ax)
sns.histplot(prt_gdf['duration'], ax=duration_ax, bins=80, kde=False, color='#2a9d8f')
sns.countplot(data=prt_gdf, x='Location Type', ax=place_ax)
<Axes: xlabel='Location Type', ylabel='count'>

How to Choose My Airbnb – Evaluation Data Modeling

After getting an overview of Airbnb and bar entertainment in New York, we wanted to model the choices for different visitor needs. First, we modeled the behavior of a tourist who wants an Airbnb that is not too expensive, does not have too many loud bars or too many parties in the vicinity of his home, and has a certain number of bars or parties within a certain distance to ensure that he can have a ‘last drink before going home’ in the vicinity of his home. Based on such needs, we constructed two levels of data.

The first is based on the ‘residential parameters’ of neighborhood as a unit:

  1. the average ending time of parties within a certain radius of the residence;
  2. the average number of complaints filed against bars within a certain radius of the residence;

and the second is based on the ‘recreational parameters’ of the neighborhood as a unit:

  1. the number of bars within a certain range around the residence;
  2. the number of Uber pickups around the bars (since the density of the distribution of vehicles varies during part of the night, taking into account Uber’s own algorithms for the scheduling of the vehicles, the availability of enough cars in the area around the bars is also a factor to be weighed)

The final element for judging the community consists of the following variables.

Note: Initially, I intended to use individual Airbnb rooms as the unit of discussion. However, while constructing the metrics, I found that when the dataset is too large, using a buffer around the coordinates of every listing to collect the surrounding points can easily cause the program to crash. If this problem can be solved in a later improvement, the analysis will be better able to support the selection of individual rooms.

Code
def calculate_mode(group):
    """Return the most frequent value of ``group``.

    ``group`` is a pandas Series; here it is one neighborhood's values passed in
    by ``groupby(...).apply``. When several values tie, ``Series.mode`` lists
    them in sorted order and the first one is returned.

    A group with no non-missing values has no mode. NaN is returned in that
    case instead of raising ``IndexError``, so one empty group cannot abort the
    whole aggregation.
    """
    modes = group.mode()
    if modes.empty:
        return np.nan
    return modes.iloc[0]
Code
# Build one table of neighborhood-level metrics (keyed by 'ntacode') that
# combines party complaints, bars, night-time Uber pickups and Airbnb listings.

# --- Party complaints per neighborhood ---
# Spatial join: one row per (neighborhood polygon, party complaint) pair.
prt_nb_gdf =nbhd_gdf.sjoin(prt_gdf, how='inner')
# Number of complaints per neighborhood: the non-null count of 'shape_area' is
# used as the row count.
prt_sum = prt_nb_gdf.groupby('ntacode').count()
prt_sum = prt_sum.reset_index()
prt_sum = prt_sum[['ntacode','shape_area']]
# Most frequent 'hour' value per neighborhood (column 'Mode').
prt_mean = prt_nb_gdf.groupby('ntacode')['hour'].apply(calculate_mode).reset_index(name='Mode')
# Mean complaint duration per neighborhood.
prt_time = prt_nb_gdf.groupby('ntacode')['duration'].mean()
prt_time = prt_time.reset_index()
prt_tt = prt_sum.merge(prt_mean,on='ntacode')
prt_total = prt_tt.merge(prt_time,on='ntacode')
# Both sides of this merge carry 'shape_area', so pandas suffixes them:
# 'shape_area_x' is the polygon attribute, 'shape_area_y' the complaint count.
prt_nb_gdf = nbhd_gdf.merge(prt_total,on='ntacode')

# --- Add bar statistics ---
# Merges on 'ntacode' and selects 'num_calls' and 'City' below, so bar_cc must
# be the per-neighborhood table at this point.
pb_nb_gdf = prt_nb_gdf.merge(bar_cc,on='ntacode')
pb_nb_gdf = pb_nb_gdf[['ntacode','shape_area_y','Mode','duration','num_calls','City','geometry','ntaname']]

# --- Night-time Uber pickups per neighborhood ---
# Pickups with hour >= 20 or hour < 7.
un_gdf = uber_gdf[(uber_gdf['hour']>=20)|(uber_gdf['hour']<7)]
un_nb_gdf =nbhd_gdf.sjoin(un_gdf, how='inner')
# As above, the non-null count of 'shape_area' is used as the pickup count.
un_sum = un_nb_gdf.groupby('ntacode').count()
un_sum = un_sum.reset_index()
un_sum = un_sum[['ntacode','shape_area']]
pbu_nb_gdf = pb_nb_gdf.merge(un_sum,on='ntacode')
# Give the count columns descriptive names.
pbu_nb_gdf.rename(columns={'shape_area_y': 'num_party','Mode': 'time_party','City': 'num_bar','shape_area':'num_uber'}, inplace=True)

# --- Add Airbnb statistics ---
# NOTE: this rebinds census_prm to a column subset that omits 'geometry'.
census_prm = census_prm[['ntacode','price','number_of_reviews','reviews_per_month','boro_name']]
pbua_nb_gdf = pbu_nb_gdf.merge(census_prm,on='ntacode')
pbua_nb_gdf.head(2)
ntacode num_party time_party duration num_calls num_bar geometry ntaname num_uber price number_of_reviews reviews_per_month boro_name
0 QN08 816 3.0 163.680351 83.5 4 MULTIPOLYGON (((-73.75205 40.70523, -73.75174 ... St. Albans 27 98.709302 39.104651 2.301744 Queens
1 BX28 2161 23.0 157.323778 18.0 7 MULTIPOLYGON (((-73.88705 40.88435, -73.88705 ... Van Cortlandt Village 31 83.115385 29.788462 1.640769 Bronx

Based on the constructed metrics, we first perform a cluster analysis to find out if the metrics have any geographical commonalities. For model selection, we use the Kmeans model. We analyze this from three perspectives-living, playing, and traveling. In the previous analysis, we already have a preliminary knowledge of the three dimensions, so in the cluster analysis, we try to conduct a cross analysis of the combination of dimensions to obtain the choice of different needs.

Code
def kmn(co_ls, n_clusters=5, random_state=42):
    """Cluster the rows of ``pbua_nb_gdf`` on the columns listed in ``co_ls``.

    The selected columns are standardized with ``StandardScaler`` and then
    clustered with ``KMeans`` (``n_init=10``).

    Parameters
    ----------
    co_ls : list of str
        Names of the ``pbua_nb_gdf`` columns to use as features.
    n_clusters : int, default 5
        Number of clusters to fit.
    random_state : int, default 42
        Seed passed to ``KMeans``.

    Returns
    -------
    The fitted model's ``labels_``: one cluster label per row of
    ``pbua_nb_gdf``, in row order.
    """
    features = StandardScaler().fit_transform(pbua_nb_gdf[co_ls])
    model = KMeans(n_clusters=n_clusters, random_state=random_state, n_init=10)
    model.fit(features)
    return model.labels_
Code
# One KMeans run per perspective; each key becomes a label column on pbua_nb_gdf.
cluster_features = {
    'all_label': ['num_party', 'time_party', 'duration', 'num_calls',
                  'num_bar', 'num_uber', 'price', 'number_of_reviews', 'reviews_per_month'],
    'travel_bar': ['num_uber', 'num_bar', 'num_party', 'duration'],
    'bar_air': ['num_party', 'num_bar', 'price', 'number_of_reviews', 'reviews_per_month'],
    'peace_air': ['time_party', 'duration', 'num_calls', 'price', 'number_of_reviews', 'reviews_per_month'],
}
# All clusterings are computed before any label column is written back.
cluster_labels = {label: kmn(columns) for label, columns in cluster_features.items()}
all_label = cluster_labels['all_label']
travel_bar = cluster_labels['travel_bar']
bar_air = cluster_labels['bar_air']
peace_air = cluster_labels['peace_air']
for label, values in cluster_labels.items():
    pbua_nb_gdf[label] = values

Where Can I Find a Bar? – Clustering Visualization

Based on the cluster analysis, we are able to provide travelers with Airbnb location options that meet their different needs. For example, if our traveler does not ask much of the accommodation itself but wants to go to a bar in the evening and take a taxi home quickly, we can analyze the clustering of the bar- and Uber-related metrics, which shows that the areas labeled 3 and 4 are very suitable places for him or her to stay.

Code
# Mean of the travel-related metrics within each "travel_bar" cluster, ordered by cluster id.
travel_cols = ['num_uber', 'num_bar', 'num_party', 'duration']
travel_profile = pbua_nb_gdf.groupby("travel_bar", as_index=False)[travel_cols].mean()
travel_profile.sort_values(by="travel_bar")
travel_bar num_uber num_bar num_party duration
0 0 809.397436 9.487179 780.089744 142.621888
1 1 100.377049 6.524590 733.622951 205.199978
2 2 763.187500 27.875000 3645.875000 149.463024
3 3 22529.000000 69.500000 2057.250000 119.591318
4 4 5410.800000 106.000000 2492.000000 147.806346
Code
# setup the figure
f, ax = plt.subplots(figsize=(10, 8))

# plot, coloring by label column
# specify categorical data and add legend
pbua_nb_gdf.plot(
    column="travel_bar",
    cmap="Dark2",
    categorical=True,
    legend=True,
    edgecolor="k",
    lw=0.5,
    ax=ax,
)


ax.set_axis_off()
plt.axis("equal");

Where Can I Find a Quiet Airbnb? – Clustering Visualization

Let’s take another example. Emily wants to spend a nice weekend in New York City, but she wants to avoid having too many bars in the neighborhood because they are loud and potentially dangerous. On the other hand, Emily doesn’t have a big budget, so she doesn’t want to spend too much money on an Airbnb. From the cluster analysis of ‘Airbnb-Bar’, we can find that the areas represented by clusters 1 and 4 (few bars and the lowest average prices in the table below) meet Emily’s needs. Overall, from the map, the area where the Bronx and Queens meet would be a great residential option for Emily.

Code
# Mean of the bar- and listing-related metrics within each "bar_air" cluster, ordered by cluster id.
bar_air_cols = ['num_party', 'num_calls', 'num_bar', 'price', 'number_of_reviews', 'reviews_per_month']
bar_air_profile = pbua_nb_gdf.groupby("bar_air", as_index=False)[bar_air_cols].mean()
bar_air_profile.sort_values(by="bar_air")
bar_air num_party num_calls num_bar price number_of_reviews reviews_per_month
0 0 3594.937500 39.004224 29.125000 112.422663 24.804384 1.061521
1 1 933.514286 34.081496 5.942857 91.453277 38.928529 2.204802
2 2 2465.166667 33.000383 107.333333 180.279458 23.902023 1.028439
3 3 994.100000 27.607487 24.800000 209.540961 18.689871 0.836661
4 4 686.850575 32.515925 6.643678 95.245483 19.849280 1.219340
Code
# setup the figure
f, ax = plt.subplots(figsize=(10, 8))

# plot, coloring by label column
# specify categorical data and add legend
pbua_nb_gdf.plot(
    column="bar_air",
    cmap="Dark2",
    categorical=True,
    legend=True,
    edgecolor="k",
    lw=0.5,
    ax=ax,
)


ax.set_axis_off()
plt.axis("equal");

Interactive Airbnb Location Selection Tools

Similarly, we provide an interactive map of the specific locations of Airbnbs in each community, where visitors can see the prices and the number and frequency of reviews of the Airbnbs in their preferred neighborhood, which can further help them make the right choice.

Code
# Initialize Panel for the notebook, loading the "tabulator" extension.
pn.extension("tabulator")
Code
# Spatially join each listing to the clustered neighborhood polygons, bringing
# the neighborhood attributes (e.g. 'ntaname') onto every listing row.
# Note: this rebinds `air`, which previously held the Drive share link.
air = gpd.sjoin(air_gdf, pbua_nb_gdf, how='inner')
Code
# Unique neighborhood names, in order of first appearance, drive the dropdown.
nt_names = pbua_nb_gdf['ntaname'].unique().tolist()

# Dropdown used by the dashboards below; it starts on "St. Albans".
neighborhoodSelect = pn.widgets.Select(
    name="Neighborhood",
    options=nt_names,
    value="St. Albans",
)

neighborhoodSelect
Code
def filter_by_neighborhood(data, neighborhood_name):
    """Return the rows of ``data`` whose 'ntaname' equals ``neighborhood_name``.

    The original index labels are kept; the result is empty when nothing matches.
    """
    return data.loc[data["ntaname"].eq(neighborhood_name)]

def airbnb_data(data, neighborhood_name):
    """Build an interactive map of one neighborhood with ``data`` drawn on top.

    Parameters
    ----------
    data : GeoDataFrame
        Rows to draw as circle markers.
    neighborhood_name : str
        Value of 'ntaname' selecting the polygon(s) in ``nbhd_gdf`` to outline.

    Returns
    -------
    The map returned by the boundary layer's ``explore`` call, with the marker
    layer added to it.
    """
    hood_geo = nbhd_gdf.loc[nbhd_gdf["ntaname"] == neighborhood_name]
    boundary_style = {"weight": 4, "color": "black", "fillColor": "none"}
    marker_style = {"radius": 7, "fill": True, "color": "crimson"}

    # The boundary layer creates the map.
    m = hood_geo.explore(
        tiles=xyzservices.providers.CartoDB.Voyager,
        name="Neighborhood boundary",
        style_kwds=boundary_style,
    )

    # Passing m= adds the markers to the existing map instead of creating a new one.
    # marker_type could also be 'marker' or 'circle'.
    # NOTE(review): the layer name "Tickets" looks left over from another project - confirm.
    data.explore(
        m=m,
        marker_type="circle_marker",
        marker_kwds=marker_style,
        name="Tickets",
    )
    return m

def create_dashboard_1(neighborhood_name):
    """Return a 600px-high Folium pane mapping the listings of ``neighborhood_name``.

    Listings are taken from the module-level ``air`` table.
    """
    listings = filter_by_neighborhood(air, neighborhood_name)
    folium_map = airbnb_data(listings, neighborhood_name)
    return pn.pane.plot.Folium(folium_map, height=600)
Code
# Dashboard layout: title + neighborhood dropdown, a spacer, then the map.
dashboard_header = pn.Column("## Airbnb in Your Neighborhood", neighborhoodSelect)
# Bind the dropdown to the map builder so the widget's value is passed as neighborhood_name.
neighborhood_map = pn.bind(create_dashboard_1, neighborhood_name=neighborhoodSelect)
ticket_dashboard_1 = pn.Column(
    dashboard_header,
    pn.Spacer(height=45),  # height spacer
    neighborhood_map,
)

ticket_dashboard_1

General Feature of Airbnb in One Neighborhood

Based on the above analysis, we have been able to offer different tourists the range of Airbnbs that matches their choice of community. We would also like to give visitors a general picture of the Airbnbs within such a community. Therefore, we take the names of the Airbnb listings within the community (as they contain some of the listings’ attractive features) and run a word cloud analysis on them to reveal the common features of Airbnbs within that community, helping tourists with further screening.

Code
def fnstring(name):
    """Join the names of all listings in neighborhood ``name`` into one string.

    Listing names are read from the module-level ``air`` table, converted to
    ``str`` and separated by single spaces.
    """
    listing_names = air.loc[air['ntaname'] == name, 'name']
    return ' '.join(listing_names.astype(str))
Code
def wcloud(name):
    """Draw a word cloud of the listing names in neighborhood ``name``.

    The text comes from ``fnstring(name)``. The matplotlib figure is returned
    rather than shown with ``plt.show()``, so a caller such as ``pn.bind`` gets
    an object it can display.

    Returns
    -------
    matplotlib.figure.Figure
        Figure containing the rendered word cloud with its axes hidden.
    """
    wc = WordCloud(
        background_color="black",
        max_words=100,
        width=1000,
        height=500,
        colormap="tab20c",
    )
    img = wc.generate(fnstring(name))
    fig, ax = plt.subplots()
    ax.imshow(img, interpolation="bilinear")
    ax.set_axis_off()
    # Detach the figure from pyplot: the caller displays the returned object,
    # and closing keeps open figures from piling up across repeated calls.
    plt.close(fig)
    return fig
Code
# Word-cloud layout: title + neighborhood dropdown, a spacer, then the word cloud.
wordcloud_header = pn.Column("## Airbnb in Your Neighborhood", neighborhoodSelect)
# Bind the dropdown to wcloud so the widget's value is passed as name.
neighborhood_cloud = pn.bind(wcloud, name=neighborhoodSelect)
temp = pn.Column(
    wordcloud_header,
    pn.Spacer(height=45),  # height spacer
    neighborhood_cloud,
)
temp